from urllib import request
from bs4 import BeautifulSoup
import re
import mysql
import time

# Holdings rows gathered for the fund currently being scraped; the script
# hands this list to mysql.updateFundStock and then starts over with an
# empty one.
fundSharesList = list()

# HTTP headers sent with every request issued by this script (a desktop
# Chrome User-Agent string).
head = dict([
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.141 Safari/537.36",
    ),
])

# The four distinct cell patterns used by the lookup tables below. Each one
# captures the text of a single <td> cell rendered in a particular way.
_LINK_TEXT = re.compile(r'html">(.*?)</a></td>')               # text of an <a href="...html"> link
_ATTR_CELL_TEXT = re.compile(r'">(.*?)</td>')                  # text right after a tag ending in ">
_EMPTY_ATTR_SPAN_TEXT = re.compile(r'="">(.*?)</span></td>')   # text of a <span> whose last attribute is ""
_PLAIN_SPAN_TEXT = re.compile(r'<span>(.*?)</span></td>')      # text of an attribute-less <span>

# Primary patterns for rows with 9 <td> cells, keyed by column index.
find = {
    1: _LINK_TEXT,
    2: _LINK_TEXT,
    **dict.fromkeys((6, 7, 8), _ATTR_CELL_TEXT),
}

# Primary patterns for every other row layout, keyed by column index.
find2 = {
    1: _LINK_TEXT,
    2: _LINK_TEXT,
    **dict.fromkeys((4, 5, 6), _ATTR_CELL_TEXT),
}

# First fallback, tried when the primary pattern finds nothing:
# column 1 is wrapped in a <span ...=""> instead of a link.
find3 = {
    1: _EMPTY_ATTR_SPAN_TEXT,
    2: _LINK_TEXT,
    **dict.fromkeys((4, 5, 6), _ATTR_CELL_TEXT),
}

# Second fallback, tried when find3 finds nothing either:
# column 2 is additionally wrapped in a bare <span>.
find4 = {
    1: _EMPTY_ATTR_SPAN_TEXT,
    2: _PLAIN_SPAN_TEXT,
    **dict.fromkeys((4, 5, 6), _ATTR_CELL_TEXT),
}


# Index of all funds, served as a JavaScript assignment.
_FUND_INDEX_URL = "http://fund.eastmoney.com/js/fundcode_search.js"

# Socket timeout (seconds) applied to every HTTP request.
_HTTP_TIMEOUT = 10


def _fetch_text(url, timeout=_HTTP_TIMEOUT):
    """Download *url* with the shared request headers and return the body decoded as UTF-8."""
    req = request.Request(url, headers=head)
    # The context manager closes the HTTP response even if read()/decode() raises.
    with request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')


def _holdings_url(fund_id, year=2018):
    """Build the holdings-page URL for *fund_id* and the given report *year*."""
    return ("http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code="
            + str(fund_id)
            + "&topline=50&year=" + str(year)
            + "&month=&rt=0.21822537857648627")


def _parse_fund_index(js):
    """Split the fund index payload into rows and keep hybrid / equity funds.

    Returns a list of rows, each a list of strings. Field 3 of a row is the
    fund type that is filtered on; the script also reads fields 0, 1 and 2.
    """
    # Drop the fixed-length wrapper around the array literal (11 leading and
    # 3 trailing characters) -- presumably `var r = [[` incl. a BOM and `]];`;
    # verify against the live payload.
    records = js[11:len(js) - 3].split("],[")
    selected = []
    for record in records:
        fields = record.replace('"', '').split(",")
        # Guard against short rows (e.g. an empty payload yields [""]), which
        # previously raised IndexError outside any try block.
        if len(fields) > 3 and fields[3] in ("混合型", "股票型"):
            selected.append(fields)
    return selected


def _report_periods(soup):
    """Return one "<4 chars before the last 年>-<1 char after it>" tag per <label class="left">."""
    periods = []
    for label in soup.find_all(attrs={'class': 'left'}, name='label'):
        parts = label.text.split("年")
        # With no "年" in the label both pieces fall back to the whole text.
        before = parts[-2] if len(parts) >= 2 else parts[0]
        after = parts[-1]
        periods.append(before[-4:] + '-' + after[0:1])
    return periods


def _extract_cell(cell_html, column, primary):
    """Return the first capture for *column*, trying primary, find3, then find4.

    Returns None when no table has a matching pattern for this cell.
    """
    for table in (primary, find3, find4):
        # The fallback tables have no entries for columns 7 and 8, so look the
        # pattern up defensively instead of raising KeyError.
        pattern = table.get(column)
        if pattern is None:
            continue
        matches = pattern.findall(cell_html)
        if matches:
            return matches[0]
    return None


def _parse_row(cells, period):
    """Turn the <td> cells of one table row into a list of values plus *period*.

    Returns None when a wanted cell cannot be parsed, so the caller can skip
    the row instead of storing a stale or missing value.
    """
    # Two layouts exist: rows with 9 cells and rows with fewer (7 per the
    # original note). The last three wanted columns are cleaned as numbers.
    if len(cells) == 9:
        primary, columns, first_numeric = find, (1, 2, 6, 7, 8), 6
    else:
        primary, columns, first_numeric = find2, (1, 2, 4, 5, 6), 4

    values = []
    for column in columns:
        if column >= len(cells):
            # Short row: only the cells that exist are collected.
            break
        value = _extract_cell(str(cells[column]), column, primary)
        if value is None:
            print("数据格式不正确, 调整解析格式")
            return None
        if column >= first_numeric:
            value = str(value).replace("%", "").replace(",", "")
            if len(value) > 8:
                # NOTE(review): pause kept from the original code; it looks
                # like a debugging aid for unusually long values -- confirm
                # whether it is still wanted.
                time.sleep(3)
        values.append(value)
    values.append(period)
    return values


def _parse_holdings(html):
    """Parse a holdings page into a fresh list of rows, one per parseable <tr>.

    Raises ValueError when the page has more <tbody> tables than period labels.
    """
    soup = BeautifulSoup(html, "html.parser")
    periods = _report_periods(soup)
    rows = []
    for index, body in enumerate(soup.find_all("tbody")):
        if index >= len(periods):
            raise ValueError(
                "found more <tbody> tables than report-period labels (%d labels)"
                % len(periods)
            )
        period = periods[index]
        for tr in body.find_all("tr"):
            row = _parse_row(tr.find_all("td"), period)
            if row is not None:
                rows.append(row)
    return rows


if __name__ == "__main__":
    errorNum = 0
    funds = _parse_fund_index(_fetch_text(_FUND_INDEX_URL))

    for fund in funds:
        fund_id = fund[0]
        print(fund_id + " " + fund[2])
        try:
            fundSharesList = _parse_holdings(_fetch_text(_holdings_url(fund_id)))
            mysql.updateFundStock(fund_id, fundSharesList)
        except Exception as e:
            # Best-effort scrape: report the failure and move on to the next
            # fund. Failed funds are not retried.
            print("(%s, %s)基金持仓获取失败!" % (fund_id, fund[1]))
            print(e)
            errorNum = errorNum + 1
            print("共计失败次数：%s" % (errorNum))
        finally:
            # Always start the next fund with an empty list so rows from a
            # failed fund can never be written under another fund's id.
            fundSharesList = []





